#########
## Nature Human Behaviour 
## Leading Countries in Global Science Increasingly Receive More Citations than Other Countries Despite Doing Similar Research.
## https://doi.org/10.1038/s41562-022-01351-5
## Harvard Dataverse (Code and Metadata): https://doi.org/10.7910/DVN/WCOINR 
## Step 3
## Data: Data_20210905
#########

# At a terminal, run: 
# ml python/3.6.1
# ml py-ipython/6.1.0_py36 python/3.6.1 py-scipy/1.1.0_py36 py-scikit-learn/0.19.1_py36 py-pandas/0.23.0_py36 gcc/10.1.0 py-pytorch/1.4.0_py36
# ml py-numpy/1.17.2_py36
# export PYTHONPATH=$GROUP_HOME/python/lib/python3.6/site-packages:$PYTHONPATH
# srun python3 -u Step_X3_Python3_RR_MAG_Country_Vignette.py

#############################
### Time Start
#############################
# Wall-clock timestamp taken before the heavy imports and processing begin.
# NOTE(review): start_time is never read in the visible source, so no elapsed
# time is reported at the end of the run; confirm whether a closing print
# was intended.
import time 
start_time = time.time()

#############################
### Modules
#############################
import glob as glob
from pyathena import connect
import pandas as pd
import json
import numpy as np
import string
import gc 
import sys
from collections import Counter
import random
import bz2 
import pickle
import itertools
import multiprocessing as mp
from os import path
import os 
import random
import time 
import re

import itertools
import psutil

#############################
#### Helpers
#############################

# Glob pattern matching the per-discipline nation-level edgelist files.
EDGELIST_GLOB = "OUTPUT_Python_Field_KLD_Citation_Edgelist_Corpus_RAKE_and_GoogleAPI_EnglishOnly_*"

# Columns read from every edgelist file; all other columns are skipped.
required_columns = ["Receiver","Sender","Discipline","Year","Percentile_Cutoff","Citation_Window","Citation_Type","Journal_Censoring","Delta_Citation_ij","Delta_Citation_ji"]

# Keys that define one row of the regional output.
REGIONAL_GROUP_KEYS = ["Sender_Region","Receiver_Region","Year","Discipline","Citation_Type","Journal_Censoring","Percentile_Cutoff"]


def discipline_from_filename(filename: str) -> str:
    """Return the discipline label encoded in an edgelist file name.

    The label is the text after the last underscore, cut at the first dot,
    e.g. ``"..._EnglishOnly_Physics.csv"`` -> ``"Physics"``.
    """
    return filename.split("_")[-1].split(".")[0]


def aggregate_to_regions(edges: pd.DataFrame, regions: pd.DataFrame, discipline: str) -> pd.DataFrame:
    """Collapse a nation-level edgelist to region-level mean citation deltas.

    Parameters
    ----------
    edges
        Nation-level rows with ``Receiver``, ``Sender``, ``Year``,
        ``Citation_Type``, ``Journal_Censoring``, ``Percentile_Cutoff``,
        ``Delta_Citation_ij`` and ``Delta_Citation_ji`` columns.
    regions
        Lookup table with ``Nation`` and ``Region`` columns.
    discipline
        Value written to the ``Discipline`` column; it replaces whatever the
        input frame held in that column.

    Returns
    -------
    pandas.DataFrame
        One row per combination of the regional group keys, carrying the mean
        of ``Delta_Citation_ij`` and ``Delta_Citation_ji``.
    """
    # Both merges are inner joins (the pandas default): rows whose Receiver or
    # Sender has no entry in the lookup are dropped without any message.
    with_receiver = (
        pd.merge(edges, regions, left_on=["Receiver"], right_on=["Nation"])
        .drop(columns=["Nation"])
        .rename(columns={"Region": "Receiver_Region"})
    )
    with_both = (
        pd.merge(with_receiver, regions, left_on=["Sender"], right_on=["Nation"])
        .drop(columns=["Nation"])
        .rename(columns={"Region": "Sender_Region"})
    )
    with_both["Discipline"] = discipline

    # observed=False is spelled out because Percentile_Cutoff is loaded as a
    # categorical: it is the default of the pandas release named in the header
    # (0.23), while later releases warn about and then change that default.
    # NOTE(review): with observed=False pandas also emits key combinations that
    # never occur in the data (their means are NaN); switch to observed=True if
    # those rows are not wanted downstream.
    # NOTE(review): Citation_Window is loaded but is not a group key, so the
    # means pool every citation window - confirm this is intended.
    return (
        with_both.groupby(REGIONAL_GROUP_KEYS, observed=False)
        .agg({"Delta_Citation_ij": "mean", "Delta_Citation_ji": "mean"})
        .reset_index()
    )


def load_regional_edgelist(filename: str, regions: pd.DataFrame) -> pd.DataFrame:
    """Read one nation-level edgelist file and return its regional aggregate.

    Only ``required_columns`` are read. The discipline is taken from the file
    name rather than from the file's own ``Discipline`` column.
    """
    # To restrict the analysis to focal countries, filter the raw frame with
    # .query("Receiver in @focal_countries") before aggregating
    # (focal_countries is not defined anywhere in this script).
    nation_edges = pd.read_csv(
        filename,
        usecols=required_columns,
        dtype={"Year": "int", "Citation_Window": "category", "Percentile_Cutoff": "category"},
    )
    return aggregate_to_regions(nation_edges, regions, discipline_from_filename(filename))


# The run itself is guarded so the helpers above can be imported on their own;
# executing the file as a script (python3 <this file>) works as before.
if __name__ == "__main__":

    #############################
    #### Read in Regional DF
    #############################
    regional_df = pd.read_csv("INPUT_R_RR_Nation_to_Regional_Classification.csv")

    # glob returns matches in arbitrary order; sorting keeps the row order of
    # the output file reproducible between runs and machines.
    list_of_edgelists = sorted(glob.glob(EDGELIST_GLOB))
    if not list_of_edgelists:
        # Without this check pd.concat fails later with the far less helpful
        # "No objects to concatenate".
        raise FileNotFoundError(
            f"No edgelist files match {EDGELIST_GLOB!r} in the working directory."
        )

    list_of_df_edgelists = [
        load_regional_edgelist(file_, regional_df) for file_ in list_of_edgelists
    ]

    edgelist_final = pd.concat(list_of_df_edgelists)

    #############################
    ### Output Filename
    #############################

    Final_Edgelist_Filename = "OUTPUT_Python_MAG_RR_Delta_Regional_Vignette.csv.gz"

    edgelist_final.to_csv(Final_Edgelist_Filename,index=False,compression='gzip')